// Scout.java
// A.L. Borchers, 1997 November
// University of Kentucky Department of Computer Science

package Scout;

import Logger.*;
import HTTPClient.*;
import SGMLKit.*;

import java.io.File;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

import java.io.InterruptedIOException;
import java.io.IOException;

import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;

import java.net.URL;
import java.net.MalformedURLException;

import java.util.Date;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;


public class Scout extends Thread {
  
  // -----------------------------------------------------------------------------
  // Scout 
  // -----------------------------------------------------------------------------
  
  // Agent information (returned by method credentials())
  public static final String agentName= "Scout";
  public static final String agentVersion= "0.3";
  
  // Headers that are added to defaults in HTTP requests (set in configure())
  public static NVPair[] scoutHeaders= null;
  
  // Loadable configuration parameters
  protected Config config= null;
  
  // GUI control panel
  private 	ScoutControls control= null;
  
  // -----------------------------------------------------------------------------
  // Run logging and persistent state support
  // -----------------------------------------------------------------------------
  
  // Default and member names for log file
  public static final String defaultLogFile= "Scout.log";
  private String logFile= null;
  
  // Default and member names for persistent data file
  public static final String defaultPersistFile= "Scout.dat";
  private String persistFile= null;
  
  // Logger for keeping track of status/errors
  public Logger logger= null;
  
  // Output stream for logger
  private 	FileOutputStream logOutputStream= null;
  
  // -----------------------------------------------------------------------------
  // Rule related objects
  // -----------------------------------------------------------------------------
  
  // RuleHash objects parsed from a SCOUT template
  protected Vector ruleSet= null;
  
  // Vector of rule threads
  protected Vector rules= null;
  
  // Accumulator for results reported by rules
  protected Results ruleResults= null;
  
  // -----------------------------------------------------------------------------
  // Network usage related objects
  // -----------------------------------------------------------------------------
  
  // Default size of buffer for content reads is 16K
  public static final int defaultReadBuffer= 16384;
  
  // Shared queue for URLs to search
  public URLQueue urls= null;
  
  // URL from which to begin search
  private URL startURL= null;
  
  // connection for chatting with Web hosts
  private HTTPConnection connection= null;
  
  // Robot exclusions monitor
  protected Nobots exclusions= null;
  
  // Whether to get the robots.txt file from hosts
  // in general, we want to do this, but it is useful to turn this 
  // off when working in the cache
  protected boolean requestRobotsFile= true;
  
  // Flags for whether to use built in exclusions in Nobots
  protected boolean useDefaultPathExclusions= true;
  protected boolean useDefaultTypeExclusions= true;
  
  // Flag whether to search the Web for documents that are not cached
  protected boolean searchWeb= false;
  
  // -----------------------------------------------------------------------------
  // Cache related items. These are protected so searching Rules and others that 
  // might need to examine the cache state are allowed in.
  // -----------------------------------------------------------------------------
  
  // CacheManager to buffer recent documents
  protected CacheManager cache= null;
  
  // Flag whether to search documents already cached
  protected boolean searchCache= false;
  
  // Record of URLs we've already searched on this run
  protected VisitRecord visits= null;
  
  // hostnames for deciding when we need new HTTPConnections and Nobots
  // (We only need new structures when host changes from pass to pass) 
  protected String currentHost= null, lastHost= null;
  
  // -----------------------------------------------------------------------------
  // Shared objects and timing data 
  // -----------------------------------------------------------------------------
  
  // DocBuffer where incoming text data is put
  protected DocBuffer buffer= null;
  
  // Flag exit status
  boolean done= false;
  
  // Times in milliseconds to stall Scout and Rules while waiting on each other
  public		final long defaultScoutDelay= 1000, defaultNetDelay= 5000;
  protected long scoutDelay= defaultScoutDelay, netDelay= defaultNetDelay;
  protected long startTime= 0, endTime= 0;
  
  // -----------------------------------------------------------------------------
  // Markup processing
  // -----------------------------------------------------------------------------
  
  // Entity table file for interpolated SGML entities
  private final String defaultEntityFile= "entities.txt";
  private String entityFile= defaultEntityFile; 
  
  // Splitter to get tags and text from markup
  protected static SGMLSplitter splitter= null;
  
  // -----------------------------------------------------------------------------
  // Stats tracking
  // -----------------------------------------------------------------------------
  
  // URL expand limit and stat tracking counters
  private int maxURLs= 0, discoveredURLs= 0, requestedURLs= 0, 
    ignoredURLs= 0, failedURLs= 0, failedURLStreams= 0;
  
  // -----------------------------------------------------------------------------
  // Constructor
  // -----------------------------------------------------------------------------
  // Given a configuration file and a rule template file, begin the search process
  // confFile     = Name of the configuration file (sections of name=value pairs)
  //                specifying operational parameters for this SCOUT instance
  // templateFile = Name of the SCOUT template file from which the RuleHash
  //                objects describing the rules are loaded
  // -----------------------------------------------------------------------------
  // Builds a Scout from a configuration file and a rule template file, then
  // starts the crawl thread.
  //   confFile     = name of the configuration file handed to configure()
  //   templateFile = name of the template file handed to loadTemplate()
  // Any exception raised during set-up is reported and the log stream is
  // closed; in that case the thread is never started.
  Scout(String confFile, String templateFile) {
    try {
      configure(confFile);
      // load the rules template
      ruleSet= loadTemplate(templateFile);
      // init the rules object
      rules= new Vector();
      // Build a Rule for every template entry. loadRule returns null for
      // entries that do not yield a rule object (commands, ignored entries,
      // load failures), and those are simply not added.
      if (ruleSet != null) {
        for (int i= 0; i < ruleSet.size(); i++) {
          RuleHash rh= (RuleHash)ruleSet.elementAt(i);
          Rule r= loadRule(rh);
          if (r != null) {
            rules.addElement(r);
          }
        }
      }
      logger.log("Scout.Scout - Loaded " + rules.size() + " rules");
      for (int i= 0; i < rules.size(); i++) {
        logger.log("Scout.Scout - " + ((Rule)rules.elementAt(i)).dumpConfig());
      }
      // Restore state information and get crawling
      if (!restoreState()) {
        // Caught below and causes abnormal exit; the message makes the
        // resulting log line say why.
        throw new Exception("could not restore cache or persistent state");
      }
      // And they're off!
      start();
    }
    catch (Exception e) {
      // configure() can fail before the logger and its stream exist, so
      // neither may be dereferenced unconditionally here.
      if (logger != null) {
        logger.log("Scout.Scout - Error initializing Scout " + e.toString());
      }
      else {
        System.err.println("Scout.Scout - Error initializing Scout " + e.toString());
      }
      e.printStackTrace();
      if (logOutputStream != null) {
        closeLogStream();
      }
    }
  }
  
  // get the results produced by a rule named key processing docNumber
  // the r argument should be a this reference so the results object can 
  // log the call
  // Fetch what the rule named key reported for document docNumber.
  // r identifies the calling rule and is passed through to the results object.
  public synchronized Vector getRuleResults(Rule r, String key, int docNumber) {
    Vector reported= ruleResults.get(r,key,docNumber);
    return reported;
  }
  
  // Report whether the visit record already contains the given URL string.
  public boolean visitedURL(String urlstr) {
    boolean alreadySeen= visits.visited(urlstr);
    return alreadySeen;
  }

  // URL overload: delegates to the String version using the URL's text form.
  public boolean visitedURL(URL url) {
    String asText= url.toString();
    return visitedURL(asText);
  }
 
  // -----------------------------------------------------------------------------
  // configure
  // -----------------------------------------------------------------------------
  // Load the config object and initialize fundamental Scout vars and objects 
  // dependent on it 
  // -----------------------------------------------------------------------------
  // NOTE: We save some table lookup by using the local variables, but the config
  // table is still available with protected scope so that other classes can "add"
  // items to it. Exactly which items are worthy of elevation to a member variable
  // is a subject for debate. On the one hand, they speed up access to commonly 
  // used items. OTOH, they add a substantial amount of overhead to the Scout 
  // object and complicate communication between other classes and Scout runtime
  // parameters. Food for thought...
  // -----------------------------------------------------------------------------
  // Loads the configuration file and initializes the members derived from it:
  // start URL, delays, URL limit, search flags, exclusion flags, persist file,
  // log stream, logger (windowed or plain), SGML splitter and request headers.
  //   confFile = name of the configuration file to load
  // Unusable configuration (unreadable file, bad StartURL, non-numeric
  // numeric settings, both search modes disabled, unopenable log file) is
  // reported on stderr and terminates the JVM, since no logger exists yet.
  private void configure(String confFile) {
    // Load the configuration info
    try {
      config= new Config(confFile);
    }
    catch (IOException e) {
      System.err.println("Scout.configure - Error loading configuration file " + confFile + " - " + e.toString());
      System.exit(1);
    }
    // Show the config on stdout
    config.dump();
    // String for buffering conf items we need to check non-null
    String confItem= null;
    //
    // SCOUT Section of config
    //
    // Set start URL
    confItem= config.get("SCOUT","StartURL");
    try {
      startURL= new URL(confItem);
    }
    catch (Exception e) {
      System.err.println("Scout.configure - Missing or malformed StartURL argument in config");
      System.exit(1);
    }
    // Numeric settings: the wait limit on Scout (when it has to wait on
    // rules), the delay for "nice" net communication and the maximum number
    // of URLs to examine. A value that does not parse is a configuration
    // error and is handled like the other fatal configuration errors above.
    try {
      confItem= config.get("SCOUT","ScoutDelay");
      scoutDelay= confItem == null ? defaultScoutDelay : Long.parseLong(confItem.trim());
      confItem= config.get("SCOUT","NetDelay");
      netDelay= confItem == null ? defaultNetDelay : Long.parseLong(confItem.trim());
      confItem= config.get("SCOUT","MaxURLs");
      maxURLs= confItem == null ? 0 : Integer.parseInt(confItem.trim());
    }
    catch (NumberFormatException e) {
      System.err.println("Scout.configure - Non-numeric ScoutDelay, NetDelay or MaxURLs argument in config - " + 
        e.toString());
      System.exit(1);
    }
    
    // NOTE(review): SCOUT/CacheDir is not consumed by this method; presumably
    // the cache component reads it from config itself - confirm.
    
    // set whether to search cached documents
    searchCache= Boolean.valueOf(config.get("SCOUT","SearchCache")).booleanValue();
    
    // set the file to save state information to 
    confItem= config.get("SCOUT","PersistFile");
    persistFile= confItem == null ? defaultPersistFile : confItem;
    
    // set whether to search Web documents
    searchWeb= Boolean.valueOf(config.get("SCOUT","SearchWeb")).booleanValue();
    if (!searchCache && !searchWeb) {
      System.err.println("Scout.configure - SearchWeb or SearchCache must be initialized true");
      System.exit(-1);
    }
    
    // set whether to get robots.txt from hosts; when the item is absent the
    // member keeps its initial value, as for the exclusion flags below
    confItem= config.get("SCOUT","RequestRobotsFile");
    requestRobotsFile= confItem == null ? 
      requestRobotsFile : Boolean.valueOf(confItem).booleanValue();
    // set whether to use local or default exclusions
    confItem= config.get("SCOUT","UseDefaultPathExclusions");
    useDefaultPathExclusions= confItem == null ? 
      useDefaultPathExclusions : Boolean.valueOf(confItem).booleanValue();
    confItem= config.get("SCOUT","UseDefaultTypeExclusions");
    useDefaultTypeExclusions= confItem == null ? 
      useDefaultTypeExclusions : Boolean.valueOf(confItem).booleanValue();
    
    // set the log file from the config
    confItem= config.get("SCOUT","LogFile");
    logFile= confItem == null ? defaultLogFile : confItem;
    // Get the output stream for the log file
    try {
      logOutputStream= new FileOutputStream(logFile);
    }
    catch (IOException e) {
      System.err.println("Scout.configure - Couldn't open stream for logging to " + 
        logFile + " - " + e.toString());
      System.exit(-1);
    }
    // Decide to use GUI or silent logging
    if (Boolean.valueOf(config.get("SCOUT","UseGUI")).booleanValue()) {
      // Start the control panel and logger
      System.out.println("Scout.configure - Starting GUI components");
      logger= new WindowedLogger("SCOUT Activity",logOutputStream,20,80);
      control= new ScoutControls(this);
    }
    else {
      logger= new Logger(logOutputStream);
    }
    //
    // EXTRACTOR section - items specific to SGMLKit stuff
    // 
    // start the splitter we'll use to separate tags and text
    confItem=  config.get("EXTRACTOR","EntityFile");
    entityFile= confItem == null ? defaultEntityFile : confItem;
    splitter= new SGMLSplitter(entityFile);
    // Create headers that will be added to each request
    scoutHeaders= new NVPair[1];
    scoutHeaders[0]= new NVPair("User-Agent",credentials());
  }
  
  // -----------------------------------------------------------------------------
  // run method
  // -----------------------------------------------------------------------------
  // Loops until queue of URLs is empty or the maximum number of URLs to examine 
  // has been reached. For each URL collected, text and tags are separated, links
  // are extracted from tags and appended to queue, and text is buffered until 
  // each Rule has gained access to it.
  // -----------------------------------------------------------------------------
  // Thread body: crawls the URL queue, hands each fetched document to the
  // rules through the DocBuffer, caches it, then waits for the rules to
  // finish, logs their results and saves state.
  // On any exception the rule threads are stopped; state is saved, the
  // elapsed time is logged and the log stream is closed in either case.
  public void run() {
    try {
      Date d= new Date();
      startTime= d.getTime();
      logger.log("Scout.run - Scout started at " + d.toString());
      // Seed the URL queue with the start target if it is empty
      // (i.e. if we aren't working from an active restore)
      if (urls.size() == 0) {
        urls.append(startURL);
      }
      // Create a DocBuffer to buffer discovered documents for the Rules
      buffer= new DocBuffer(this,rules.size());
      // Start the rules
      for (int i= 0; i < rules.size(); i++) {
        ((Rule)rules.elementAt(i)).start();
      }
      // While there are URLs to explore and we haven't reached the limit, explore them
      boolean searchLimitReached= false;
      URL nextURL= null;
      while (!searchLimitReached && (nextURL= (URL)urls.removeFront()) != null) {
        String nextURLStr= nextURL.toString();
        if (!visits.visited(nextURLStr)) {
          // go ahead and add the URL to the visited records whether or not the URL 
          // fails on this run. This way, we will not attempt to reload sites that
          // caused problems in the current run. 
          visits.add(nextURLStr);
          // Get the document object for this URL
          Document doc= getDocument(nextURL);
          // buffer Document for access by rules
          buffer.fill(doc);
          // If we haven't already, cache the document
          if (!cache.isCached(nextURLStr)) {
            cache.cacheDocument(nextURLStr,doc);
          }
        }
        // set to break if we've already successfully loaded the limit of urls
        searchLimitReached= maxURLs > 0 && 
          (requestedURLs - failedURLs - failedURLStreams) >= maxURLs;
      }
      logger.log("Scout.run - URL queue exhausted. Shutting down buffer and exiting...");
      // shutdown the buffer (stalls the scout till rules get the current string)
      buffer.close();
      // wait(0) would block until notified and a negative timeout throws, so
      // a non-positive configured delay falls back to the default poll period
      long pollDelay= scoutDelay > 0 ? scoutDelay : defaultScoutDelay;
      // wait till all rules report
      synchronized (this) {
        while (rulesLive()) {
          try {
            wait(pollDelay);
          }
          catch (InterruptedException e) {
            // deliberately ignored: the loop re-checks rulesLive()
          }
        }
      }
      // Log the data collected from the rules
      logResults();
    }
    catch (Exception e) {
      logger.log("Scout.run - Abnormal exit on exception " + e.toString());
      e.printStackTrace();
      // stop the rules
      for (int i= 0; i < rules.size(); i++) {
        ((Rule)rules.elementAt(i)).stop();
      }
    }
    // Stamp the end time on both the normal and the abnormal path so the
    // elapsed time reported below is meaningful either way
    endTime= new Date().getTime();
    // Save the final state
    saveState();
    long runSeconds= (endTime - startTime)/1000;
    logger.log("Scout.run - Finished after " + runSeconds/60 +
      " minutes " + runSeconds%60 + " seconds");
    closeLogStream();
    done= true;
  }
  
  // -----------------------------------------------------------------------------
  // logResults
  // -----------------------------------------------------------------------------
  // Write out contents of rule results and collection stats to the log
  // -----------------------------------------------------------------------------
  // Writes the accumulated rule results, then the URL counters, to the log.
  // Each rule's "logopt" attribute selects the report form (matched without
  // regard to case):
  //   absent or LOGALL = every result vector the rule reported
  //   FINAL            = only the last result vector reported
  //   NOLOG            = a single line noting that results are suppressed
  // Any other value is reported in the log instead of being dropped silently.
  protected synchronized void logResults() {
    String[] keys= sortEnumeration(ruleResults.keys());
    if (keys != null) {
      for (int i= 0; i < keys.length; i++) {
        // Look up the rule and check how to report its output
        Rule r= ruleLookup(keys[i]);
        if (r == null) {
          System.err.println("Error - no rule returned for name " + keys[i]);
          continue;
        }
        String logOption= r.attr.get("logopt");
        if (logOption == null || logOption.equalsIgnoreCase("LOGALL")) {
          // dump the whole load
          Vector results= (Vector)ruleResults.get(keys[i]);
          if (results != null) {
            for (int j= 0; j < results.size(); j++) {
              Vector out= (Vector)results.elementAt(j);
              for (int k= 0; k < out.size(); k++) {
                logger.log(keys[i] + "[" + j + "," + k + "]: " + out.elementAt(k).toString());
              }
            }
          }
        }
        else if (logOption.equalsIgnoreCase("FINAL")) {
          // Log only last results reported; a rule that reported nothing
          // has no last entry, so there is nothing to index
          Vector results= (Vector)ruleResults.get(keys[i]);
          if (results != null && results.size() > 0) {
            int last= results.size()-1;
            Vector out= (Vector)results.elementAt(last);
            for (int j= 0; j < out.size(); j++) {
              logger.log(keys[i] + "[" + last + "," + j + "]: " + out.elementAt(j).toString());
            }
          }
        }
        else if (logOption.equalsIgnoreCase("NOLOG")) {
          logger.log("Scout.logResults - Results for " + keys[i] + " suppressed");
        }
        else {
          logger.log("Scout.logResults - Unrecognized logopt " + logOption + 
            " for " + keys[i] + " - results not logged");
        }
      }
    }
    logger.log("Scout.logResults - URL Stats:" +
      " discovered = " + discoveredURLs + 
      " requested = " + requestedURLs +
      " expanded = " + (requestedURLs - failedURLs - failedURLStreams) + 
      " ignored = " + ignoredURLs + 
      " failed = " + failedURLs +
      " error = " + failedURLStreams);
  }
  
  // -----------------------------------------------------------------------------
  // ruleLookup
  // -----------------------------------------------------------------------------
  // Lookup and return a rule by name
  // -----------------------------------------------------------------------------
  // Scan the loaded rules in order and hand back the first one whose name
  // equals the argument; null when none matches.
  protected Rule ruleLookup(String name) {
    Enumeration candidates= rules.elements();
    while (candidates.hasMoreElements()) {
      Rule candidate= (Rule)candidates.nextElement();
      if (candidate.getName().equals(name)) {
        return candidate;
      }
    }
    return null;
  }
  
  
  // -----------------------------------------------------------------------------
  // loadRule
  // -----------------------------------------------------------------------------
  // Create a rule instance (it is started later, by run()) and return a
  // reference to it; returns null for command and ignored entries
  // PRE: A valid template file tag has been parsed into the hash argument
  // -----------------------------------------------------------------------------
  // Interprets one template entry according to the first letter of its
  // "type" field (case-insensitive):
  //   'c' = executive command: the public method named by "name" (lowercased)
  //         taking a RuleHash is invoked on this Scout; null is returned
  //   'd' = rule: the class named by "rule" is loaded and constructed with
  //         (Scout, RuleHash); the new instance is returned
  //   'i' = ignored entry; null is returned
  // Every failure (malformed entry, unknown class or method, reflection
  // error) is logged and yields null, so one bad entry does not abort the
  // loading of the others.
  private Rule loadRule(RuleHash ruleInit) {
    Rule r= null; 			// Rule to be returned
    try {
      // Determine if the type is executive command or rule application
      String typeField= ruleInit.get("type");
      if (typeField == null || typeField.length() == 0) {
        throw new RuleFormatException("Unrecognized TYPE field");
      }
      char type= typeField.toLowerCase().charAt(0);
      switch (type) {
      case 'c':
        // Executive command rule, execute it and return null.
        // The argument to any valid exec method is the template hash
        { // scope limit
          String command= ruleInit.get("name");
          if (command == null) {
            throw new RuleFormatException("Missing NAME field for command");
          }
          // Look the method up by declared parameter type; the runtime class
          // of the argument would not match if it were a subclass
          Class[] argTypes= { RuleHash.class };
          Object[] args= { ruleInit };
          getClass().getMethod(command.toLowerCase(),argTypes).invoke(this,args);
        } // end scope limit
        break;
      case 'd':
        { // scope limit
          String ruleClassName= ruleInit.get("rule");
          if (ruleClassName == null) {
            throw new RuleFormatException("Missing RULE field");
          }
          // getConstructor matches parameter types exactly, so the declared
          // types are used rather than the runtime classes of the arguments
          Class[] argTypes= { Scout.class, RuleHash.class };
          Object[] args= { this, ruleInit };
          // TODO:
          // The search order for classes is 1: runtime rulebase 2: scout rulebase 3: pwd
          // This will involve implementing a network class loader
          Constructor ruleConstructor= 
            Class.forName(ruleClassName).getConstructor(argTypes);
          r= (Rule)ruleConstructor.newInstance(args);
        } // end scope limit
        break;
      case 'i':
        // ignore type
        break;
      default:
        throw new RuleFormatException("Unrecognized TYPE field");
      } // end type switch 
    } // end try
    // the following catch handles non-existent rules
    catch (ClassNotFoundException e) {
      logger.log("Scout.loadRule - Couldn't load rule " + 
        ruleInit.get("rule") + " " + e.toString());
      e.printStackTrace();
    }
    // the named class loaded but is not a Rule
    catch (ClassCastException e) {
      logger.log("Scout.loadRule - Couldn't load rule " + 
        ruleInit.get("rule") + " " + e.toString());
      e.printStackTrace();
    }
    // The following exceptions arise from invoking exec commands
    // (possibly rule calls as well)
    catch (NoSuchMethodException e) {
      logger.log("Scout.loadRule - Couldn't invoke nonexistent method " + 
        ruleInit.get("name") + " " + e.toString());
      e.printStackTrace();
    }
    catch (InstantiationException e) {
      logger.log("Scout.loadRule - Error invoking method " + 
        ruleInit.get("name") + " " + e.toString());
      e.printStackTrace();
    }
    catch (InvocationTargetException e) {
      // The wrapper itself says nothing useful; report what the invoked
      // method or constructor actually threw
      logger.log("Scout.loadRule - Error invoking method " + 
        ruleInit.get("name") + " " + e.getTargetException());
      e.printStackTrace();
    }
    catch (IllegalAccessException e) {
      logger.log("Scout.loadRule - Security exception invoking method " + 
        ruleInit.get("name") + " " + e.toString());
      e.printStackTrace();
    }
    catch (RuleFormatException e) {
      logger.log("Scout.loadRule - Invalid rule data in rule " +
        ruleInit.toString() + " " + e.toString());
      e.printStackTrace();
    }
    return r;
  }
  
  // -----------------------------------------------------------------------------
  // getDocument(URL)
  // -----------------------------------------------------------------------------
  // Load the document specified by a URL from the cache or the Web as 
  // configuration and necessity dictate. Returns a Document object flagged
  // invalid if any errors occur during the load.
  // -----------------------------------------------------------------------------
  // Loads the document for url, from the cache when searchCache is set and
  // the URL is cached, otherwise from the Web when searchWeb is set.
  //   url = the URL to load
  // Returns a Document for the URL. It is flagged valid only when it came
  // from the cache or when a 2XX response body was read completely; every
  // failure is logged and yields a document flagged invalid.
  // When the host is the same as on the previous call, the Web request is
  // preceded by a pause of netDelay milliseconds.
  private synchronized Document getDocument(URL url) {
    // Init with a null document which will be returned if we fail
    // to load from both cache and network
    Document out= new Document();
    out.setURL(url);
    out.setHTML(false);
    out.setValid(false);
    try {
      // Flag used to indicate that we should be polite to the network
      // Will be set false if we move on to a new host
      boolean nice= true;
      // set up new exclusions and connection if required
      currentHost= url.getHost();
      if (lastHost == null || !currentHost.equals(lastHost)) {
        // host different than the one we visited last so there's no reason to stall
        nice= false;
        exclusions.getHostExclusions(currentHost);
        connection= new HTTPConnection(url);
        // set the user agent header so webmasters will know who we are
        connection.setDefaultHeaders(scoutHeaders);
        lastHost= currentHost;
      }
      // Announce our intention to request
      logger.log("Scout.getDocument - Requesting URL " + url.toString() + " from cache");
      requestedURLs++;
      String urlStr= url.toString();
      if (searchCache && cache.isCached(urlStr)) {
        out= cache.getDocument(urlStr);
        logger.log("Scout.getDocument - Got URL " + url.toString() + " from cache");
      }
      else if (!searchWeb) {
        logger.log("Scout.getDocument - Web search is disabled...");
      }
      else {
        // note the delay in the log entry if we're going to wait
        logger.log("Scout.getDocument - URL not cached - hitting the Web " + 
          (nice ? ("in " + netDelay + "ms") : "now"));
        // wait(0) blocks until notified and a negative timeout throws, so
        // only a positive delay is actually waited for
        if (nice && netDelay > 0) {
          try {
            wait(netDelay);
          }
          catch (InterruptedException e) {
            // deliberately ignored: an early wake-up just sends the request sooner
          }
        }
        // GET the URL
        HTTPResponse response= connection.Get(url.getFile());
        // read and store the headers into an array
        Enumeration headerKeys= response.listHeaders();
        Vector v= new Vector();
        while (headerKeys.hasMoreElements()) {
          String nextHeader= (String)headerKeys.nextElement();
          v.addElement(new NVPair(nextHeader,response.getHeader(nextHeader)));
        }
        // copy the vector into an array for storage in the document
        NVPair[] headers= new NVPair[v.size()];
        v.copyInto(headers);
        out.setHeaders(headers);
        // Check status code of response for setting document valid/invalid
        // A document is considered valid if the class of response code is 2XX
        boolean valid= response.getStatusCode()/100 == 2; 
        // print a diagnostic and increment failed URLs if invalid
        if (!valid) {
          logger.log("Scout.getDocument - Non 2XX HTTP response " + response.getStatusCode() + 
            " on " + url.toString() + " " + response.getReasonLine());
          failedURLs++;
        }
        else {
          // Content length as given by the server, or "unknown" if absent
          String contentLength;
          try {
            contentLength= out.getHeader("Content-length");
            if (contentLength == null) {
              contentLength= "unknown";
            }
          }
          catch (NoSuchHeaderException nshe) {
            contentLength= "unknown";
          }
          // Read-buffer size: the declared length when it is a usable
          // positive number, otherwise defaultReadBuffer. The header comes
          // from the remote server, so a malformed value must not throw.
          int contentBufferLength= defaultReadBuffer;
          if (!contentLength.equals("unknown")) {
            try {
              int declaredLength= Integer.parseInt(contentLength.trim());
              if (declaredLength > 0) {
                contentBufferLength= declaredLength;
              }
            }
            catch (NumberFormatException nfe) {
              // keep defaultReadBuffer
            }
          }
          InputStream in= null;
          try {
            // Get the stream for loading entity
            in= response.getInputStream();
            byte[] readBuffer= readContent(in,contentBufferLength);
            // Add a content length header to the document if the server didn't provide
            if (contentLength.equals("unknown")) {
              contentLength= "" + readBuffer.length;
              out.addHeader(new NVPair("Content-length",contentLength));
            }
            // Put the data in the document
            out.setBytes(readBuffer);
            // Since we can't always rely on a header to indicate HTML, we try to parse every 
            // document as such and let the result of the parse be our guide...
            Vector tags= new Vector();
            StringBuffer text= new StringBuffer();
            boolean splitStatus= false;
            try {
              splitStatus= splitter.split(new ByteArrayInputStream(out.getBytes()),tags,text);
            }
            catch (DocumentAccessException de) {
              // Should never happen as we just filled the document?!
              System.err.println("Scout.getDocument - Something has gone terribly wrong with this document...");
              System.exit(1);
            }
            logger.log("Scout.getDocument - " + 
              (splitStatus ? "Separated tags and text" : "Document not parseable as valid HTML"));
            // If we successfully parsed the document, save it in split form 
            // (the original can be recreated in normalized HTML using join)
            if (splitStatus) {
              out.setTags(tags);
              out.setText(text.toString());
            }
            out.setHTML(splitStatus);
            out.setValid(true);
          }
          catch (IOException e) {
            logger.log("Scout.getDocument - I/O Error processing URL stream (" + url.toString() + ") - " + 
              e.toString());
            out.setValid(false);
            failedURLStreams++;
          }
          finally {
            // Release the response stream whether or not the read succeeded
            if (in != null) {
              try {
                in.close();
              }
              catch (IOException ignored) {
                // nothing useful can be done about a failed close
              }
            }
          }
        }
      }
    }
    catch (ProtocolNotSupportedException e) {
      logger.log("Scout.getDocument - Failed connection to URL " + url.toString() + " - " + e.toString());
      failedURLStreams++;
    }
    catch (AuthTypeNotImplementedException e) {
      logger.log("Scout.getDocument - Failed connection to URL " + url.toString() + " - " + e.toString());
      failedURLStreams++;
    }
    catch (CacheException e) {
      logger.log("Scout.getDocument - Cache is corrupt: " + e.toString()); 
    }
    catch (InterruptedIOException e) {
      logger.log("Scout.getDocument - Socket timed out on " + url.toString() + " - " + e.toString());
      // Counted like the other connection failures so the URL is not
      // reported as successfully expanded
      failedURLStreams++;
    }
    catch (IOException e) {
      logger.log("Scout.getDocument - Failed connection to URL " + url.toString() + " - " + e.toString());
      failedURLStreams++;
    }
    return out;
  }

// -----------------------------------------------------------------------------
// readContent
// -----------------------------------------------------------------------------
// Read an unknown content length from an input stream, returning it in a byte
// array
// -----------------------------------------------------------------------------
// Reads the stream until a read returns no bytes and returns everything read.
//   in                  = stream to drain
//   contentBufferLength = size of the read buffer and initial capacity of the
//                         result; it is a hint only, content of any length
//                         is accepted
// Returns a new array holding exactly the bytes read.
// Throws IOException if a read fails.
private byte[] readContent(InputStream in, int contentBufferLength) 
throws IOException {
  byte[] readBuffer= new byte[contentBufferLength];
  // The accumulator grows itself as needed, so consecutive reads that
  // together exceed contentBufferLength cannot overrun it
  ByteArrayOutputStream collected= new ByteArrayOutputStream(contentBufferLength);
  int bytesRead= 0;
  // while reads return, append to the accumulator
  while ((bytesRead= in.read(readBuffer)) > 0) {
    collected.write(readBuffer,0,bytesRead);
  }
  return collected.toByteArray();
}

// -----------------------------------------------------------------------------
// closeLogStream
// -----------------------------------------------------------------------------
// Close the logger output stream when all logging is done
// -----------------------------------------------------------------------------
// Closes the log output stream. Does nothing when the stream was never
// opened; a failure to close is reported on stderr.
private void closeLogStream() {
    if (logOutputStream == null) {
      return;
    }
    try {
      logOutputStream.close();
    }
    catch (IOException e) {
      System.err.println("Scout.closeLogStream - Error closing log file - " + e);
    }
}

// -----------------------------------------------------------------------------
// rulesLive
// -----------------------------------------------------------------------------
// Return true if any rule is still alive
// -----------------------------------------------------------------------------
// True as soon as one rule thread is found alive, false when none is.
protected boolean rulesLive() {
    boolean anyAlive= false;
    int idx= 0;
    while (!anyAlive && idx < rules.size()) {
      anyAlive= ((Rule)rules.elementAt(idx)).isAlive();
      idx++;
    }
    return anyAlive;
}

// -----------------------------------------------------------------------------
// saveState
// -----------------------------------------------------------------------------
// Save the current state of the Scout session, including cache, rule results,
// search queue and visit record
// -----------------------------------------------------------------------------
protected boolean saveState() {
    // Returns true only if both the cache and the persist file were written
    // without an IOException. A cache failure does not stop the attempt to
    // write the persist file.
    boolean success= true;
    // Store the cache
    try {
      cache.save();
    }
    catch (IOException e) {
      success= false;
      logger.log("Scout.run - Couldn't save cache...");
    }
    // Store other state objects. The file stream is tracked separately so it
    // can be released even when the ObjectOutputStream could not be built.
    FileOutputStream fileOut= null;
    ObjectOutputStream out= null;
    try {
      fileOut= new FileOutputStream(persistFile);
      out= new ObjectOutputStream(fileOut);
      // restoreState() reads these back in exactly this order
      out.writeObject(exclusions);
      out.writeObject(urls);
      out.writeObject(visits);
      out.writeObject(ruleResults);
      // Closing flushes buffered data; a failure here means the state was
      // not completely written, so it is reported like any other save error
      out.close();
    }
    catch (IOException e) {
      success= false;
      logger.log("Scout.saveState - Error saving state information: " + e);
    }
    finally {
      // Release the file handle on every path. After a successful
      // out.close() this second close has no effect.
      if (fileOut != null) {
        try {
          fileOut.close();
        }
        catch (IOException ignored) {
          // Best-effort cleanup; any real failure was already logged above
        }
      }
    }
    return success;
}

// -----------------------------------------------------------------------------
// restoreState
// -----------------------------------------------------------------------------
// Restore the contents of the cache and persist file
// -----------------------------------------------------------------------------
private boolean restoreState() {
    // Returns true only if the cache was initialized and the remaining state
    // objects were either read from the persist file or freshly created.
    // On failure some state fields may be left unset or partially restored.
    boolean success= true;
    // Restore the cache
    try {
      cache= new CacheManager(this);
      logger.log("Scout.restoreState - Initialized cache of " + cache.size() + " objects");
    }
    catch (CacheException e) {
      success= false;
      logger.log("Scout.restoreState - Cache initialization failed: " + e.toString());
    }
    // Restore other state objects. The file stream is tracked separately so
    // it can be closed even when reading fails part way through.
    FileInputStream fileIn= null;
    try {
      File f= new File(persistFile);
      if (f.exists()) {
        // Restore existing state, in the order saveState() wrote it.
        // NOTE(review): readObject trusts the contents of the persist file -
        // confirm that only this program can write it.
        fileIn= new FileInputStream(persistFile);
        ObjectInputStream in= new ObjectInputStream(fileIn);
        exclusions= (Nobots)in.readObject();
        exclusions.setScout(this);
        logger.log("Scout.restoreState - Restored robots exclusion data");
        urls= (URLQueue)in.readObject();
        urls.setScout(this);
        logger.log("Scout.restoreState - Restored queue of " + urls.size() + " URLs");
        visits= (VisitRecord)in.readObject();
        logger.log("Scout.restoreState - Restored " + visits.size() + " visit records");
        ruleResults= (Results)in.readObject();
        ruleResults.setScout(this);
        logger.log("Scout.restoreState - Restored " + ruleResults.size() + " result records");
        in.close();
      }
      else {
        // Create new state
        logger.log("Scout.restoreState - No state data found. Creating new objects");
        exclusions= new Nobots(useDefaultPathExclusions,useDefaultTypeExclusions);
        exclusions.setScout(this);
        urls= new URLQueue(this);
        visits= new VisitRecord();
        ruleResults= new Results(this);
      }
    }
    catch (Exception e) {
      // Covers I/O errors, missing classes and bad casts alike
      success= false;
      logger.log("Scout.restoreState - Error restoring state information: " + e);
      e.printStackTrace();
    }
    finally {
      // Release the file handle on every path. After a successful
      // in.close() this second close has no effect.
      if (fileIn != null) {
        try {
          fileIn.close();
        }
        catch (IOException ignored) {
          // Best-effort cleanup; any real failure was already logged above
        }
      }
    }
    return success;
}

// -----------------------------------------------------------------------------
// sortEnumeration(Enumeration)
// -----------------------------------------------------------------------------
// Convert enumeration into a sorted list of key Strings
// -----------------------------------------------------------------------------
private String[] sortEnumeration(Enumeration e) {
    // Returns the enumeration's elements as an array in ascending
    // String.compareTo order, or null when the enumeration is empty.
    // Gather the elements first, since the element count is not known
    // until the enumeration has been walked.
    Vector v= new Vector();
    while (e.hasMoreElements()) {
      v.addElement(e.nextElement());
    }
    // An empty enumeration yields null, not a zero-length array
    if (v.isEmpty()) {
      return null;
    }
    String[] out= new String[v.size()];
    // copyInto throws ArrayStoreException if any element is not a String
    v.copyInto(out);
    // Library sort in natural order replaces the hand-written selection sort;
    // fully qualified so no additional import is required
    java.util.Arrays.sort(out);
    return out;
}

// -----------------------------------------------------------------------------
// loadTemplate(String)
// -----------------------------------------------------------------------------
// load a template from a file into a vector of RuleHash objects and return 
// the vector or null if no template found or if an error occurs
// -----------------------------------------------------------------------------
private static Vector loadTemplate(String templateFile) {
    // Opens the named file and hands it to loadTemplate(InputStream).
    // Returns whatever that call returns, or null if the file could not be
    // opened; failures are reported as a stack trace on stderr.
    Vector out= null;
    FileInputStream in= null;
    try {
      in= new FileInputStream(templateFile);
      out= loadTemplate(in);
    }
    catch (Exception e){
      e.printStackTrace();
    }
    finally {
      // Close on every path so the file handle is never left open
      if (in != null) {
        try {
          in.close();
        }
        catch (IOException e) {
          // A close failure does not discard an already loaded template
          e.printStackTrace();
        }
      }
    }
    return out;
}

// -----------------------------------------------------------------------------
// loadTemplate(InputStream)
// -----------------------------------------------------------------------------
// Load RuleHashes from a template coming from a stream
// -----------------------------------------------------------------------------
private static Vector loadTemplate(InputStream templateStream) {
    // Returns a Vector of RuleHash objects built from the "scout" tags found
    // between the tstart and tend markers. Returns null when no tstart marker
    // exists, when a non-scout tag appears inside the template, or when any
    // exception is raised while splitting the stream or building a rule.
    Vector parsed= new Vector();
    try {
      Enumeration tagList= splitter.getTags(templateStream).elements();

      // Skip ahead until the scout tag carrying the tstart key has been consumed
      boolean started= false;
      while (!started && tagList.hasMoreElements()) {
        Tag candidate= (Tag)tagList.nextElement();
        started= candidate.getIdentifier().equals("scout") && candidate.containsKey("tstart");
      }
      if (!started) {
        return null;
      }

      // Everything up to the tend marker must be a scout tag describing a rule
      boolean ended= false;
      while (!ended && tagList.hasMoreElements()) {
        Tag item= (Tag)tagList.nextElement();
        if (!item.getIdentifier().equals("scout")) {
          // a foreign tag inside the template is an error case
          return null;
        }
        if (item.containsKey("tend")) {
          ended= true;
        }
        else {
          parsed.addElement(new RuleHash(item));
        }
      }
    }
    catch (RuleFormatException rfe) {
      // A malformed rule: its stack trace is the only report
      rfe.printStackTrace();
      return null;
    }
    catch (Exception e) {
      // Anything else is reported as a failure of the template stream
      System.err.println("Scout.loadTemplate - Error while processing template stream");
      e.printStackTrace();
      return null;
    }
    return parsed;
}

protected static String credentials() {
    // Agent identification in the form name/version
    final String separator= "/";
    return agentName + separator + agentVersion;
}

// METHODS BELOW THIS LINE ARE THOSE CALLED DYNAMICALLY BY TEMPLATES

// -----------------------------------------------------------------------------
// setvar
// -----------------------------------------------------------------------------
// Set a RUNTIME configuration variable in response to a SETVAR template item
// -----------------------------------------------------------------------------
public void setvar(RuleHash rule) {
    // Reads the rule's "value" field, expected in the form name=value, and
    // stores it in the RUNTIME section of the configuration. A missing or
    // malformed field is logged and otherwise ignored.
    String pair= rule.get("value");
    if (pair != null) {
      int eq= pair.indexOf("=");
      // Need at least one character before the '=' and three characters overall
      if (eq >= 1 && pair.length() >= 3) {
        String name= pair.substring(0,eq);
        String setting= pair.substring(eq+1);
        config.set("RUNTIME",name,setting);
        logger.log("Scout.setvar - Set runtime variable " + pair);
      }
      else {
        logger.log("Scout.setvar - Error setting runtime variable - cannot resolve NV pair in value field " + pair);
      }
    }
    else {
      logger.log("Scout.setvar - Error setting runtime variable - null value in template");
    }
}

// -----------------------------------------------------------------------------
// templatebase
// -----------------------------------------------------------------------------
// Set the runtime template base URL
// -----------------------------------------------------------------------------
public void templatebase(RuleHash rule) {
    // Stores the rule's "value" field, minus one trailing slash, as the
    // RUNTIME "templatebase" setting. A missing field is logged and ignored.
    String arg= rule.get("value");
    if (arg == null) {
      logger.log("Scout.templatebase - Error setting runtime templatebase - null field in rule");
      return;
    }
    // strip the terminal / if present
    if (arg.endsWith("/")) {
      arg= arg.substring(0,arg.length()-1);
    }
    config.set("RUNTIME","templatebase",arg);
    // Log the stored value itself rather than the name of the variable
    logger.log("Scout.templatebase - Set runtime template base URL to " + arg);
}

// -----------------------------------------------------------------------------
// rulebase
// -----------------------------------------------------------------------------
// Set the runtime rule base URL
// -----------------------------------------------------------------------------
public void rulebase(RuleHash rule) {
    // Stores the rule's "value" field, minus one trailing slash, as the
    // RUNTIME "rulebase" setting. A missing field is logged and ignored.
    String arg= rule.get("value");
    if (arg == null) {
      logger.log("Scout.rulebase - Error setting runtime rulebase - null field in rule");
      return;
    }
    // strip the terminal / if present
    if (arg.endsWith("/")) {
      arg= arg.substring(0,arg.length()-1);
    }
    config.set("RUNTIME","rulebase",arg);
    // Log the stored value itself rather than the name of the variable
    logger.log("Scout.rulebase - Set runtime rule base URL to " + arg);
}

// -----------------------------------------------------------------------------
// Main program usage: java Scout.Scout iniFile templateFile
// -----------------------------------------------------------------------------
public static void main(String[] args) {
    // Expects two arguments: args[0] is the ini file, args[1] the template file.
    // With fewer, explain the command line instead of failing with an
    // ArrayIndexOutOfBoundsException stack trace.
    if (args == null || args.length < 2) {
      System.err.println("Usage: java Scout.Scout iniFile templateFile");
      return;
    }
    try {
      String iniFile= args[0];
      String templateFile= args[1];
      // NOTE(review): the instance is not used further here, so presumably
      // the constructor itself sets the run in motion - confirm
      new Scout(iniFile, templateFile);
    }
    catch (Exception e) {
      e.printStackTrace();
    }
}

}
